library(modeltime)
library(dplyr)
library(EIAapi)
library(jsonlite)
library(gt)
library(plotly)
library(lubridate)
library(modeltime)
source("../pipeline/eia_data.R")
source("../pipeline/backtesting.R")
Data Refresh
Load libraries
API Settings:
# Load the pipeline settings and flatten the series definitions into a
# lookup table with one row per sub-balancing-authority (sub-BA) series
meta_json <- read_json(path = "../settings/settings.json")
s <- meta_json$series
series <- lapply(seq_along(s), function(i) {
  data.frame(
    parent_id = s[[i]]$parent_id,
    parent_name = s[[i]]$parent_name,
    subba_id = s[[i]]$subba_id,
    subba_name = s[[i]]$subba_name
  )
}) |>
  bind_rows()
# Template for the EIA API facets argument; filled in per series inside the
# refresh loop below
facets_template <- list(
  parent = NULL,
  subba = NULL
)

# API key comes from the environment — never hard-code credentials
eia_api_key <- Sys.getenv("EIA_API_KEY")

# File paths and backtesting settings pulled from settings.json
api_path <- meta_json$api_path
meta_path <- meta_json$meta_path
data_path <- meta_json$data_path
forecast_path <- meta_json$forecast_path
forecast_log_path <- meta_json$forecast_log_path
calibrated_models_path <- meta_json$calibrated_models_path
h <- meta_json$backtesting$h
lags <- meta_json$backtesting$features$lags |> unlist()
train_length <- meta_json$train_length
offset <- meta_json$offset
tz <- meta_json$timezone
models_settings <- meta_json$backtesting$models

# Refresh-run flags: not an initial load, and persist results to disk
init <- FALSE
save <- TRUE

# Pull the series metadata and determine which series have new observations
meta_obj <- get_metadata(api_key = eia_api_key, api_path = api_path, meta_path = meta_path, series = series)
gt(meta_obj$request_meta)
| parent | subba | end_act | request_start | end | updates_available |
|---|---|---|---|---|---|
| CISO | PGAE | 2024-11-01 07:00:00 | 2024-11-01 08:00:00 | 2025-01-06 08:00:00 | TRUE |
| CISO | SCE | 2024-11-01 07:00:00 | 2024-11-01 08:00:00 | 2025-01-06 08:00:00 | TRUE |
| CISO | SDGE | 2024-11-01 07:00:00 | 2024-11-01 08:00:00 | 2025-01-06 08:00:00 | TRUE |
| CISO | VEA | 2024-11-01 07:00:00 | 2024-11-01 08:00:00 | 2025-01-06 08:00:00 | TRUE |
# Refresh loop: for each series with updates available, pull the raw data,
# regularize it onto a complete hourly index, validate, and append it to the
# local data store; collect per-series metadata along the way
m <- meta_obj$request_meta
index <- meta_obj$last_index + 1
data <- NULL
meta_new <- NULL
for (i in seq_len(nrow(m))) {
  facets <- facets_template
  facets$parent <- m$parent[i]
  facets$subba <- m$subba[i]
  start <- m$request_start[i]
  end <- m$end[i]
  print(paste(facets$parent, facets$subba, sep = " - "))
  if (m$updates_available[i]) {
    # Pad the request window by 24h on each side and trim afterwards, so
    # boundary observations are not lost to offset/timezone effects
    temp <- eia_backfill(
      start = start - lubridate::hours(24),
      end = end + lubridate::hours(24),
      offset = offset,
      api_key = eia_api_key,
      api_path = paste0(api_path, "data"),
      facets = facets
    ) |> dplyr::filter(time >= start & time <= end)
    # Join onto a complete hourly grid so gaps show up as explicit NAs
    # (local name avoids clobbering the `index` set above the loop)
    hourly_index <- seq.POSIXt(from = start, to = end, by = "hour")
    ts_obj <- data.frame(period = hourly_index) |>
      left_join(temp, by = c("period" = "time"))
    data_available <- TRUE
  } else {
    ts_obj <- NULL
    print("No new data is available")
    data_available <- FALSE
  }
  meta_temp <- create_metadata(data = ts_obj, start = start, end = end, type = "refresh")
  if (is.null(ts_obj)) {
    # create_metadata cannot infer the series ids from a NULL object
    meta_temp$parent <- m$parent[i]
    meta_temp$subba <- m$subba[i]
  }
  if (meta_temp$success) {
    print("Append the new data")
    d <- append_data(data_path = data_path, new_data = ts_obj, save = TRUE)
    meta_temp$update <- TRUE
  } else {
    meta_temp$update <- FALSE
    if (data_available && !meta_temp$success) {
      meta_temp$comments <- paste(meta_temp$comments, "The data refresh failed, please check the log; ", sep = "")
    }
  }
  meta_temp$index <- NA
  meta_df <- as.data.frame(meta_temp)
  if (!is.null(ts_obj)) {
    data <- bind_rows(data, ts_obj)
  }
  meta_new <- bind_rows(meta_new, meta_df)
}
[1] "CISO - PGAE"
[1] "CISO - SCE"
[1] "CISO - SDGE"
[1] "CISO - VEA"
gt(meta_new)
| index | parent | subba | time | start | end | start_act | end_act | start_match | end_match | n_obs | na | type | update | success | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | CISO | PGAE | 2025-01-07 08:19:44.610963 | 2024-11-01 08:00:00 | 2025-01-06 08:00:00 | 2024-11-03 08:00:00 | 2025-01-06 08:00:00 | FALSE | TRUE | 1585 | 48 | refresh | FALSE | FALSE | The start argument does not match the actual; Missing values were found; The data refresh failed, please check the log; |
| NA | CISO | SCE | 2025-01-07 08:19:46.38831 | 2024-11-01 08:00:00 | 2025-01-06 08:00:00 | 2024-11-03 08:00:00 | 2025-01-06 08:00:00 | FALSE | TRUE | 1585 | 48 | refresh | FALSE | FALSE | The start argument does not match the actual; Missing values were found; The data refresh failed, please check the log; |
| NA | CISO | SDGE | 2025-01-07 08:19:48.281468 | 2024-11-01 08:00:00 | 2025-01-06 08:00:00 | 2024-11-03 08:00:00 | 2025-01-06 08:00:00 | FALSE | TRUE | 1585 | 48 | refresh | FALSE | FALSE | The start argument does not match the actual; Missing values were found; The data refresh failed, please check the log; |
| NA | CISO | VEA | 2025-01-07 08:19:49.840905 | 2024-11-01 08:00:00 | 2025-01-06 08:00:00 | 2024-11-03 08:00:00 | 2025-01-06 08:00:00 | FALSE | TRUE | 1585 | 48 | refresh | FALSE | FALSE | The start argument does not match the actual; Missing values were found; The data refresh failed, please check the log; |
meta_updated <- append_metadata(meta_path = meta_path, new_meta = meta_new, save = TRUE, init = FALSE)
[1] "Saving the metadata file"
Plot the Series
We will use Plotly to visualize the series:
# Visualize only the newly refreshed observations (if any were pulled)
if (!is.null(data)) {
  d <- data |> arrange(subba, period)
  p <- plot_ly(d, x = ~period, y = ~value, color = ~subba, type = "scatter", mode = "lines")
  p
} else {
  print("No new data is available")
}

# Reload the full local dataset from disk with explicit column types, so
# parsing is deterministic regardless of the file's contents
data <- readr::read_csv(file = data_path, col_types = readr::cols(
  period = readr::col_datetime(format = ""),
  subba = readr::col_character(),
  subba_name = readr::col_character(),
  parent = readr::col_character(),
  parent_name = readr::col_character(),
  value = readr::col_double(),
  value_units = readr::col_character()
))
head(data)
# A tibble: 6 x 8
period subba subba_name parent parent_name value value_units
<dttm> <chr> <chr> <chr> <chr> <dbl> <chr>
1 2019-01-01 01:00:00 PGAE Pacific Gas an~ CISO California~ 11256 megawattho~
2 2019-01-01 02:00:00 PGAE Pacific Gas an~ CISO California~ 12501 megawattho~
3 2019-01-01 03:00:00 PGAE Pacific Gas an~ CISO California~ 12513 megawattho~
4 2019-01-01 04:00:00 PGAE Pacific Gas an~ CISO California~ 12094 megawattho~
5 2019-01-01 05:00:00 PGAE Pacific Gas an~ CISO California~ 11604 megawattho~
6 2019-01-01 06:00:00 PGAE Pacific Gas an~ CISO California~ 11166 megawattho~
# i 1 more variable: type <chr>
# Interactive line chart of hourly demand for all sub-BA series
p <- plot_ly(data, x = ~period, y = ~value, color = ~subba, type = "scatter", mode = "lines")
p
Refresh the forecast
# Refresh the forecast; refresh_forecast returns NULL when there are not
# enough new observations to warrant a new forecast
fc <- NULL
fc <- refresh_forecast(
  input = data,
  forecast_log_path = forecast_log_path,
  forecast_path = forecast_path,
  models_settings = models_settings,
  h = h,
  index = "period",
  var = "value",
  # NOTE(review): hard-coded window (~25 months of hourly data) shadows the
  # `train_length` value loaded from settings.json — confirm which is intended
  train_length = 24 * 31 * 25,
  lags = lags,
  init = init,
  save = save,
  seasonal = TRUE,
  trend = TRUE
)
Checking if new data points are available to refresh the forecast
There is no sufficient data to refresh the forecast
# Preview and plot the refreshed forecast, with the last 3 days of actuals
# for context; skipped entirely when no forecast was produced
if (!is.null(fc)) {
  head(fc)
  plot_forecast(
    input = data,
    forecast = fc,
    var = "value",
    index = "period",
    hours = 24 * 3
  )
}
Score the Forecast
# Score previously unscored forecasts against realized values: MAPE, RMSE,
# and prediction-interval coverage, persisted back to the forecast log
fc_log <- load_forecast_log(forecast_log_path = forecast_log_path)
score_rows <- which(!fc_log$score)
if (length(score_rows) == 0) {
  message("All models were scored ")
} else {
  subba <- unique(fc_log$subba[score_rows])
  fc <- load_forecast(forecast_path = forecast_path)
  for (i in subba) {
    print(i)
    d <- data |> dplyr::filter(subba == i)
    r <- which(fc_log$subba == i & !fc_log$score)
    for (l in r) {
      # Join the forecast to whatever actuals have arrived so far
      f <- fc |>
        dplyr::filter(forecast_label == fc_log$forecast_label[l], subba == i) |>
        dplyr::left_join(d |> dplyr::select(time = period, subba, value), by = c("time", "subba")) |>
        dplyr::filter(!is.na(value))
      fc_log$mape[l] <- mean(abs(f$value - f$yhat) / f$value)
      fc_log$rmse[l] <- sqrt(mean((f$value - f$yhat)^2))
      # Share of actuals that fall inside the prediction interval
      fc_log$coverage[l] <- sum(f$value <= f$upper & f$value >= f$lower, na.rm = TRUE) / nrow(f)
      # Mark as scored only once the full horizon has been realized
      if (nrow(f) == fc_log$h[l]) {
        fc_log$score[l] <- TRUE
      }
      # Persist after every forecast so partial progress survives a crash
      write.csv(fc_log, forecast_log_path, row.names = FALSE)
    }
  }
  gt::gt(fc_log[score_rows, ])
}
[1] "SCE"
[1] "PGAE"
[1] "SDGE"
[1] "VEA"
| index | subba | model | method | time | forecast_label | start | end | h | n_obs | n_obs_flag | na_flag | success | score | mape | rmse | coverage |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 162 | SCE | GLMNET | model18 | 2024-08-24 08:14:07.2153 | 2024-08-24 | 2024-08-23 | 2024-08-24 07:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.06509622 | 1113.04728 | 0.7755102 |
| 165 | PGAE | LM | model6 | 2024-09-11 23:57:44.710466 | 2024-08-25 | 2024-07-08 | 2024-07-09 01:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.04352206 | 687.79296 | 1.0000000 |
| 166 | SCE | GLMNET | model18 | 2024-09-11 23:57:44.712173 | 2024-08-24 | 2024-07-08 | 2024-07-09 01:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.06509622 | 1113.04728 | 0.7755102 |
| 269 | PGAE | LM | model6 | 2024-10-16 14:15:05.359172 | 2024-10-16 | 2024-10-16 | 2024-10-16 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.02862341 | 519.01880 | 0.8000000 |
| 270 | SCE | GLMNET | model18 | 2024-10-16 14:15:05.361531 | 2024-10-16 | 2024-10-16 | 2024-10-16 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.02839913 | 485.05785 | 0.9250000 |
| 329 | PGAE | LM | model6 | 2024-11-01 08:19:20.961854 | 2024-11-01 | 2024-11-01 | 2024-11-01 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.02636641 | 352.06331 | 1.0000000 |
| 330 | SCE | GLMNET | model18 | 2024-11-01 08:19:20.964311 | 2024-11-01 | 2024-11-01 | 2024-11-01 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.03687811 | 459.28437 | 0.8750000 |
| 331 | SDGE | LM | model7 | 2024-11-01 08:19:20.966741 | 2024-11-01 | 2024-11-01 | 2024-11-01 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.04747243 | 124.92836 | 1.0000000 |
| 332 | VEA | LM | model5 | 2024-11-01 08:19:20.969141 | 2024-11-01 | 2024-11-01 | 2024-11-01 23:00:00 | 24 | 24 | TRUE | FALSE | TRUE | FALSE | 0.15922518 | 11.55274 | 0.8750000 |